---
title: "W3:Sequence models"
execute:
warning: false
error: false
format:
html:
toc: true
toc-location: right
code-fold: show
code-tools: true
number-sections: true
code-block-bg: true
code-block-border-left: "#31BAE9"
---
Week 3: Sequence models

In the last couple of weeks you looked first at tokenizing words to get numeric values for them, and then at using embeddings to group words of similar meaning depending on how they were labelled. This gave you a good, but rough, sentiment analysis: words such as 'fun' and 'entertaining' might show up in a positive movie review, while 'boring' and 'dull' might show up in a negative one. But sentiment can also be determined by the sequence in which words appear. For example, 'not fun' is of course the opposite of 'fun'. This week you'll start digging into a variety of model formats that are used to train models to understand context in sequences!
```{python}
import tensorflow as tf
```
```{python}
import sys
# Show the Python version used to run this notebook
print(sys.version)
```
```{python}
import os
# Show the installed TensorFlow package version
os.system('pip3 show tensorflow')
```
```{python}
import os
# Show the installed Keras package version
os.system('pip3 show keras')
```
# LSTMs
## download data
The `imdb_reviews/subwords8k` dataset contains positive/negative IMDB movie reviews, pre-tokenized with a vocabulary of roughly 8k subwords.
```{python}
import tensorflow_datasets as tfds
# Download the subword encoded pretokenized dataset
dataset, info = tfds.load('imdb_reviews/subwords8k', with_info=True, as_supervised=True)
# Get the tokenizer
tokenizer = info.features['text'].encoder
```
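To see what the subword encoder actually does, you can round-trip a short string through it. This is a quick sanity check rather than part of the original notebook, and the sample sentence is arbitrary.

```{python}
# Round-trip an arbitrary sentence through the subword encoder
sample = 'This movie was not fun at all'
ids = tokenizer.encode(sample)
print(ids)

# Decode each ID on its own to see the subword pieces it maps to
print([tokenizer.decode([i]) for i in ids])

# Decoding the full ID list recovers the original string
print(tokenizer.decode(ids))
```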
## make training and testing data
```{python}
BUFFER_SIZE = 10000
BATCH_SIZE = 256
# Get the train and test splits
train_data, test_data = dataset['train'], dataset['test']
# Shuffle the training data
train_dataset = train_data.shuffle(BUFFER_SIZE)
# Batch the datasets, padding each batch to the length of its longest sequence
train_dataset = train_dataset.padded_batch(BATCH_SIZE)
test_dataset = test_data.padded_batch(BATCH_SIZE)
```
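Because `padded_batch` pads each batch only to the length of its longest review, different batches can have different sequence lengths. As a quick check (assuming the pipeline above), you can inspect the shapes of one batch:

```{python}
# Peek at one batch: shape is (batch_size, longest_review_in_this_batch)
for reviews, labels in train_dataset.take(1):
    print('reviews:', reviews.shape, reviews.dtype)
    print('labels: ', labels.shape, labels.dtype)
```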
## define model
```{python}
import tensorflow as tf
# Hyperparameters
embedding_dim = 32
lstm1_dim = 32
lstm2_dim = 16
dense_dim = 16

# Build the model: stacked bidirectional LSTMs over subword embeddings
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(tokenizer.vocab_size, embedding_dim),
    # return_sequences=True so the second LSTM receives the full sequence
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(lstm1_dim, return_sequences=True)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(lstm2_dim)),
    tf.keras.layers.Dense(dense_dim, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])
```
```{python}
# Print the model summary
model.summary()
```
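Note in the summary that each `Bidirectional` wrapper doubles its LSTM's output features, because the forward and backward passes are concatenated: the first LSTM emits `2 * lstm1_dim = 64` features per timestep and the second `2 * lstm2_dim = 32`. As a minimal sketch (the random batch of IDs below is made up purely for illustration), you can confirm the model maps a batch of reviews to one sigmoid score each:

```{python}
import numpy as np

# A made-up batch of 4 'reviews', each 10 random subword IDs long
dummy = np.random.randint(1, tokenizer.vocab_size, size=(4, 10))
print(model(dummy).shape)  # (4, 1): one sigmoid score per review
```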
## compile model
```{python}
# Set the training parameters
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
```
## training model
```{python}
NUM_EPOCHS = 3
history = model.fit(train_dataset, epochs=NUM_EPOCHS, validation_data=test_dataset)
```
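Before plotting the curves, you can also get a single held-out score with `evaluate`; this step is not in the original notebook, but it reuses the same `test_dataset`.

```{python}
# Report final loss and accuracy on the test split
loss, acc = model.evaluate(test_dataset)
print(f'test loss: {loss:.4f}, test accuracy: {acc:.4f}')
```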
## model result
```{python}
import matplotlib.pyplot as plt
# Plot utility
def plot_graphs(history, string):
    plt.plot(history.history[string])
    plt.plot(history.history['val_' + string])
    plt.xlabel("Epochs")
    plt.ylabel(string)
    plt.legend([string, 'val_' + string])
    plt.show()
# Plot the accuracy and results
plot_graphs(history, "accuracy")
plot_graphs(history, "loss")
```
# save model
```{python}
# Save the entire model as a `.keras` zip archive
model.save('c3week3_movie_review_model.keras')
```
# load model
```{python}
new_model = tf.keras.models.load_model('c3week3_movie_review_model.keras')
```
```{python}
new_model.summary()
```
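To confirm the reloaded model works end to end, here is a minimal inference sketch on a made-up review; the sentence and the 0.5 decision threshold are assumptions for illustration, not part of the course code.

```{python}
# Score a made-up review with the reloaded model
sample_review = 'This movie was dull and boring, not fun at all'
encoded = tf.constant([tokenizer.encode(sample_review)])  # batch of one
score = float(new_model.predict(encoded)[0][0])
print(f'score: {score:.3f} ->', 'positive' if score >= 0.5 else 'negative')
```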
# resources
- <https://www.coursera.org/learn/natural-language-processing-tensorflow>
- <https://github.com/https-deeplearning-ai/tensorflow-1-public/tree/main/C3>